package com.kdtech.analyse.news;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.HtmlCleaner;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.analyse.JSoupUtils;
import com.kdtech.analyse.tool.ParseLogic;
import com.kdtech.analyse.tool.ParseTool;
import com.kdtech.analyse.tool.SubstrLogic;
import com.kdtech.analyse.tool.SubstrType;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.RegexUtils;

/**
 * {@link AnalyseNews} implementation for chinanews.com and several of its
 * {@code www.XX.chinanews.com} sub-sites.
 *
 * <p>It decides whether a URL looks like an article ("detail") page and extracts the
 * title, date, body and source of a downloaded page into a {@link NewsMeta}.
 */
public class ChinaNewsNewsAnalyse implements AnalyseNews {

	/** Label ("Source:") that precedes the source/author name in the page text. */
	private static final String SOURCE_LABEL = "来源：";

	/**
	 * Regular expressions describing article URLs. They are handed to
	 * {@link RegexUtils#matchAny} unchanged.
	 */
	private static final String[] DETAIL_PAGE_PATTERNS = {
			"http://.*.chinanews.com/[0-9]*/[0-9]*/[0-9]*.html",
			"http://.*.chinanews.com/.*/[0-9]*-[0-9]*/[0-9]*/[0-9]*.[s]?html",
			"http://.*.chinanews.com/.*/[0-9]*/[0-9]*[0-9]*/[0-9]*.[s]?html",
			"http://.*chinanews[.]com/.*/[0-9]{6,8}/[0-9]+.[s]*html",
			"http://www.[a-z]*.chinanews.com/news1-[0-9]*.html",
			"http://www.[a-z]*.chinanews.com/news1.aspx\\?id=[0-9]*",
			"http://.*[.]chinanews.com.cn/.*/html/[0-9]+/[0-9]+.html",
			"http://www.hi.chinanews.com/.*/[0-9]*/[0-9]*/[0-9]*/[0-9]*_[0-9]*.html"
	};

	/**
	 * Tells whether {@code url} looks like an article page.
	 *
	 * <p>Example URLs:
	 * <ul>
	 * <li>http://www.chinanews.com/gn/2012/11-23/4352938.shtml</li>
	 * <li>http://game.chinanews.com/mmo/201211/2224736.html</li>
	 * </ul>
	 *
	 * @param url the URL to test; {@code null} yields {@code false}
	 * @return {@code true} when the URL matches one of the article patterns;
	 *         {@code false} for {@code null} and for URLs starting with {@code http://bbs.}
	 */
	public boolean isDetailPage(String url) {
		if (url == null || url.startsWith("http://bbs.")) {
			return false;
		}
		// Hyphens are turned into slashes so that date segments such as "11-23"
		// are matched by the slash-separated patterns.
		String normalized = url.replaceAll("-", "/");
		if (RegexUtils.matchAny(normalized, DETAIL_PAGE_PATTERNS)) {
			return true;
		}
		// The patterns that contain a literal '-' (e.g. "news1-[0-9]*.html") can
		// never match the normalized form, so the untouched URL is tested as well.
		return RegexUtils.matchAny(url, DETAIL_PAGE_PATTERNS);
	}

	/**
	 * Extracts title, date, content and author from the page held by {@code urlMeta}.
	 *
	 * <p>Generic selectors are applied first; for a fixed set of
	 * {@code www.XX.chinanews.com} hosts the title, date and content are then
	 * replaced by the result of host-specific selectors.
	 *
	 * @param urlMeta the crawled page; must not be {@code null}. A {@code null} HTML body
	 *                is treated as an empty document.
	 * @return a new {@link NewsMeta} with URL, type {@code 0}, author, title, content and date set
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		String html = urlMeta.getHtml();
		String url = urlMeta.getUrl();

		NewsMeta meta = new NewsMeta();
		meta.setUrl(url);
		meta.setType(0);

		// Jsoup cannot parse a null string; an empty document makes a missing body
		// behave like a page on which no selector matched.
		Document doc = Jsoup.parse(html == null ? "" : html);

		/*
		 * Parse the news title.
		 */
		ParseTool tool = new ParseTool(doc);
		tool.addParseLogic(new ParseLogic("#window .window1"));
		tool.addParseLogic(new ParseLogic("div#cont_1_1_2>h1"));
		tool.addParseLogic(new ParseLogic("div.news_title"));
		tool.addParseLogic(new ParseLogic("div.left_bt>ul"));
		tool.addParseLogic(new ParseLogic("div.left_bt>h1"));
		tool.addParseLogic(new ParseLogic("div.article_title"));
		tool.addParseLogic(new ParseLogic("title", new SubstrLogic(SubstrType.before, "_")));
		String title = tool.parse();

		/*
		 * Parse the time.
		 * NOTE(review): the same ParseTool instance is reused, so the title selectors
		 * are still registered unless parse() clears them - confirm in ParseTool.
		 */
		tool.addParseLogic(new ParseLogic("div.left-time"));
		tool.addParseLogic(new ParseLogic("div.news_info"));
		tool.addParseLogic(new ParseLogic("div.article_info"));
		tool.addParseLogic(new ParseLogic("div.artInfo"));
		tool.addParseLogic(new ParseLogic("div[style=text-align:right;font-size:12px;]"));
		Long date = tool.parseDate();
		if (date == null) {
			// Fall back to a date embedded in the URL.
			date = DateUtils.matchDate(url);
		}

		String content = JSoupUtils.selectContent(url, doc, "div.left_zw", "div.news_content", "div.detail_font",
				"div#artibody", "div#viewText", "div.t3", "div.window25 span");

		// Host-specific layouts: these overwrite the generic results above.
		if (url.startsWith("http://www.jl.chinanews.com/")) {
			title = doc.select("td.black_titnews").text();
			date = DateUtils.matchDate(doc.select("td.pad1.pad10").text());
			content = JSoupUtils.selectContent(url, doc, "td.dis1");
		} else if (url.startsWith("http://www.hn.chinanews.com/")) {
			title = doc.select("td[height=50]").text();
			if (StringUtils.isBlank(title)) {
				title = doc.select("div#biaoti").text();
				if (StringUtils.isBlank(title)) {
					title = doc.select("div.child_left_title").text();
				}
			}
			date = DateUtils.matchDate(doc.select("div#anthor").text());
			if (date == null) {
				date = DateUtils.matchDate(doc.select("div#sendday").text());
				if (date == null) {
					date = DateUtils.matchDate(doc.select("div.child_left_desc").text());
				}
			}
			content = JSoupUtils.selectContent(url, doc, "div#zoom", "div.child_left_text");
		} else if (url.startsWith("http://www.hi.chinanews.com/")) {
			title = doc.select("div.biaoti").text();
			date = DateUtils.matchDate(doc.select("span.left").text());
			if (date == null) {
				date = DateUtils.matchDate(doc.select("div.zwbei01").text());
			}
			content = JSoupUtils.selectContent(url, doc, "div#ArticleBody p");
		} else if (url.startsWith("http://www.bj.chinanews.com/")) {
			title = doc.select("h1").text();
			date = DateUtils.matchDate(doc.select(".branch_con_title").text());
			content = JSoupUtils.selectContent(url, doc, ".branch_con_text");
		} else if (url.startsWith("http://www.gs.chinanews.com/")) {
			title = doc.select("div.dd").text();
			date = DateUtils.matchDate(doc.select("div.xdd").text());
			content = JSoupUtils.selectContent(url, doc, "div.wz");
		} else if (url.startsWith("http://www.gz.chinanews.com/")) {
			title = doc.select("div.left_bt").text();
			date = DateUtils.matchDate(doc.select("div.left_time").text());
			content = JSoupUtils.selectContent(url, doc, "div.left_zw");
		} else if (url.startsWith("http://www.hb.chinanews.com/")) {
			title = doc.select("h1").text();
			date = DateUtils.matchDate(doc.select("div.p1").text());
			if (date == null) {
				date = JSoupUtils.matchDate(doc, "发布");
			}
			content = JSoupUtils.selectContent(url, doc, "#content_text p", "div.content");
		} else if (url.startsWith("http://www.heb.chinanews.com/")) {
			title = doc.select("td[height=80]").text();
			date = DateUtils.matchDate(doc.select("td.hui12").text());
			content = JSoupUtils.selectContent(url, doc, "td.hui15");
		} else if (url.startsWith("http://www.sc.chinanews.com/")) {
			title = doc.select("div.c_title").text();
			date = DateUtils.matchDate(doc.select("div.f_l").text());
			content = HtmlCleaner.getContentHtml(url, doc.select("div#txt_content"));
		} else if (url.startsWith("http://www.ln.chinanews.com/")) {
			title = doc.select("h1.title").text();
			date = DateUtils.matchDate(doc.select("div.info").text());
			content = HtmlCleaner.getContentHtml(url, doc.select("div.txtcontent"));
		} else if (url.startsWith("http://www.fj.chinanews.com/")) {
			title = doc.select("div.con_titl").text();
			date = DateUtils.matchDate(doc.select("li.con_time_li1").text());
			content = HtmlCleaner.getContentHtml(url, doc.select("div.con_con"));
		} else if (url.startsWith("http://www.cq.chinanews.com/")) {
			title = doc.select("div#title").text();
			date = DateUtils.matchDate(doc.select("div#title_1").text());
			content = HtmlCleaner.getContentHtml(url, doc.select("div#fontzoom"));
		}

		// Last-resort content containers, tried only while the content is still blank.
		if (StringUtils.isBlank(content)) {
			content = doc.select(".article-detail").html();
		}
		if (StringUtils.isBlank(content)) {
			content = HtmlCleaner.getContentHtml(url, doc.select("div.mcontent"));
		}

		String author = extractAuthor(doc);
		if (StringUtils.isBlank(author)) {
			author = JSoupUtils.matchAuthor(doc, "来源：");
		}

		meta.setAuthor(author);
		meta.setTitle(title);
		meta.setContent(content);
		meta.setDate(date);
		return meta;
	}

	/**
	 * Reads the source/author from the known page elements.
	 *
	 * @param doc the parsed page
	 * @return the text found after the source label, or a blank/{@code null} value when
	 *         none of the elements provides one
	 */
	private static String extractAuthor(Document doc) {
		String author = doc.select("div#con div.left-time div.left-t").text();
		if (author != null && author.indexOf(SOURCE_LABEL) != -1) {
			// The name sits between the label and the next full-width space;
			// substringBetween yields null when that closing space is absent.
			author = StringUtils.substringBetween(author, SOURCE_LABEL, "　");
		}
		if (StringUtils.isBlank(author)) {
			author = doc.select("div.con_time").text();
			if (StringUtils.isBlank(author)) {
				author = doc.select("p.fabulaiyuan").text();
			}
			author = StringUtils.substringAfter(author, SOURCE_LABEL);
		}
		return author;
	}

	/**
	 * Update hook; this implementation performs no update.
	 *
	 * @param meta the item that would be updated (ignored)
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/**
	 * Manual smoke test: downloads one sample URL and prints the parsed result,
	 * or a message when the URL does not match the article patterns.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String url = UrlArgumentTop.FromatUrl("http://www.jx.chinanews.com/news/2016/1108/3416.html");
		UrlMeta urlMeta = CrawlHTML.responseToURL(url);
		ChinaNewsNewsAnalyse analyser = new ChinaNewsNewsAnalyse();
		if (analyser.isDetailPage(url)) {
			NewsMeta parsed = analyser.parserHtml(urlMeta);
			System.out.println(parsed);
		} else {
			System.out.println("不符合正则");
		}
	}

	/**
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate() {
		return false;
	}
}
